In [1]:
    
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
%matplotlib inline
    
In [2]:
    
# show which files sit in the current working directory
os.listdir(os.curdir)
    
    Out[2]:
In [3]:
    
# read the training and test sets from the CSV files next to the notebook
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))
    
In [4]:
    
# preview the first five rows of the training set
train.head()
    
    Out[4]:
In [5]:
    
# initial look at the data: summary statistics first, then the dtype of every column
print(train.describe(), train.dtypes, sep="\n")
    
    
In [6]:
    
# number of missing values per column - there are quite a few
train.isna().sum()
    
    Out[6]:
In [7]:
    
# start with sex: how many passengers fall into each category
train['Sex'].value_counts()
    
    Out[7]:
In [8]:
    
# convert sex to 1 (male) and 0 (female)
def sexconverter(row):
    """Encode the ``Sex`` field of one passenger row as an integer.

    Parameters
    ----------
    row : mapping or pandas.Series
        Anything indexable by the key ``'Sex'``.

    Returns
    -------
    int
        1 when ``row['Sex']`` is ``'male'`` (or is already the encoded
        value 1), otherwise 0.
    """
    sex = row['Sex']
    # Also accept the already-encoded value 1: without this, running the
    # conversion a second time on an encoded column would turn every male
    # into 0, because 1 != 'male'.
    if sex == 'male' or sex == 1:
        return 1
    # Everything else (including 'female' and an already-encoded 0) maps to 0.
    return 0
# run the encoder over every row and overwrite the Sex column with the result
train['Sex'] = train.apply(sexconverter, axis='columns')
    
In [9]:
    
# passengers younger than one year - only 7 observations
train.loc[train['Age'] < 1]
    
    Out[9]:
In [10]:
    
# look at fare
# significant non-normality and right skewness
plt.figure(figsize=(10,10))
# sns.distplot is deprecated and slated for removal from seaborn; histplot with
# kde=True and stat="density" draws the density-scaled histogram plus KDE curve
# that distplot used to produce.
sns.histplot(train.Fare, kde=True, stat="density")
    
    Out[10]:
    
In [11]:
    
# fare against survival outcome, with points coloured by sex
# already we can see that mainly men did not survive!
plt.figure(figsize=(10, 10))
sns.swarmplot(data=train, x='Survived', y='Fare', hue='Sex')
    
    Out[11]:
    
In [12]:
    
# survival counts broken down by sex, including row/column totals
tmp = pd.crosstab(train['Sex'], train['Survived'], margins=True)
tmp
    
    Out[12]:
In [13]:
    
# survival by sex as proportions, normalised within each row
tmp_freq = pd.crosstab(train['Sex'], train['Survived'], margins=True, normalize="index")
tmp_freq
    
    Out[13]:
In [14]:
    
# how about pclass and survival rates?
# heatmap of passenger counts per class for each (Survived, Sex) pair -
# mostly lower class people did not survive
plt.figure(figsize=(10, 10))
tmp = pd.crosstab(train['Pclass'], [train['Survived'], train['Sex']])
sns.heatmap(data=tmp, cmap="plasma")
    
    Out[14]:
    
In [15]:
    
# how about age? violin plot of Age per survival outcome, with the individual
# passengers overlaid as semi-transparent points coloured by sex
plt.figure(figsize=(10, 10))
sns.violinplot(data=train, x="Survived", y="Age")
sns.swarmplot(data=train, x="Survived", y="Age", hue="Sex", alpha=0.5)
    
    Out[15]:
    
In [34]:
    
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics
import numpy as np
from sklearn.model_selection import train_test_split
    
In [17]:
    
# Gaussian Naive Bayes classifier with default settings; it is fitted on the
# training split in a later cell
clf = GaussianNB()
    
In [ ]:
    
# impute missing Age values with the mean of the observed ages
# (Series.mean skips NaN; fillna does the replacement in one vectorised step
# instead of recomputing the mean inside a Python loop for every missing entry)
# NOTE(review): the mean is taken over all rows before the train/test split,
# so held-out rows influence the imputed value - consider imputing after the split.
train['Age'] = train['Age'].fillna(train['Age'].mean())
    
In [29]:
    
# feature matrix (Sex, Age, Pclass) and target as NumPy arrays;
# y is selected with a list of columns, so it stays two-dimensional
X = train.loc[:, ['Sex', 'Age', 'Pclass']].values
y = train.loc[:, ['Survived']].values
    
In [30]:
    
# hold out 20% of the rows for evaluation; the fixed random_state makes the
# split - and every metric computed from it - reproducible across re-runs
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
In [32]:
    
# fit the classifier; the target is flattened to one dimension first
clf.fit(X_train, np.ravel(y_train))
    
    Out[32]:
In [40]:
    
# accuracy on the held-out split, reported as a percentage
acc = metrics.accuracy_score(y_true=y_test, y_pred=clf.predict(X_test))
print("Accuracy of GNB model is %.2f%%" % (100 * acc))
    
    
In [41]:
    
# compute the ROC curve and its area from the predicted class probabilities
probs = clf.predict_proba(X_test)
# the last column of predict_proba is used as the score
preds = probs[:, -1]
fpr, tpr, threshold = metrics.roc_curve(y_true=y_test, y_score=preds)
roc_auc = metrics.auc(x=fpr, y=tpr)
    
In [49]:
    
# draw the ROC curve (blue) against the diagonal reference line (dashed red)
plt.figure(figsize=(15, 15))
axis_font = dict(fontname='Arial', size='22')
plt.plot(fpr, tpr, color='b', label="ROC curve(area=%0.2f)" % roc_auc)
plt.plot([0, 1], [0, 1], color='r', linestyle='--')
plt.xlabel("False Positive Rate", **axis_font)
plt.ylabel("True Positive Rate", **axis_font)
plt.legend(loc="lower right")
    
    Out[49]: